# chinopen-prep.R
# Prepares data set for STM and other analyses on textual data on climate change/ air pollution
# collected by Eric Jamelske's team in Xi'an and Chengdu, summer 2015. 
# Endre Tvinnereim, 2015-2016

library(XLConnect)
# Clear all
rm(list=ls(all=TRUE))
# Load the data: segmented text using Stanford NLP
data <- readWorksheet(loadWorkbook("CN_segmented_text_for_R.xlsx"),sheet=1)
data <- read.xlsx("CN_segmented_text_for_R.xlsx",1)

# Load the substitution list
subs <- readWorksheet(loadWorkbook("chinopen-substitutions-lean.xlsx"), sheet=1)
# Now loop through the textual answers and the subsitutions list and make substitutions
oa <- data$segm_pku
for(i in 1:length(oa)) {
  for(j in 1:length(subs$orig)) { 
    oa[[i]] <- gsub(subs$orig[j], subs$replace[j], oa[[i]])  # bespoke spelling corrections, spacing, standardization
  }
} 
data$proc.oa <- oa  # where "proc.oa" means "processed open answers vector" 

# Do the same for CTB segmenting. Purpose: robustness check
oa <- data$segm_ctb
for(i in 1:length(oa)) {
  for(j in 1:length(subs$orig)) { 
    oa[[i]] <- gsub(subs$orig[j], subs$replace[j], oa[[i]])  # bespoke spelling corrections, spacing, standardization
  }
} 
data$proc.ctb <- oa  # processed open answers vector with alternative CTB segmenting method" 

# merge original text, processed text, translations, and remaining data set
# read in the translations  (minor note: "deleteline2" means that line 2, containing variable numbers, 
# was deleted from the original. All else is kept. Do this for other files of the same type)
translations.all <- readWorksheet(loadWorkbook
                              ("CC_CH_Text_Response_Transcribe_Translate_All_deleteline2_v3.xlsx"),
                              sheet=2)
# Use only ID and translations to English
translations <- translations.all[c(1,2)]

cn.alldata <- merge(data, translations,  by = "ID")

# Read in metadata, csv format
library(foreign)
num.chin.data <- read.csv("CC_China_2015_Adult_Final_9.07.15_cutduplicate88.csv", sep=";")
names(num.chin.data)[names(num.chin.data)=="�..ID"] <- "ID"

# read in climate/air binary vector
climair.vector <- readWorksheet(loadWorkbook("Open_End_Question_Id_Vector.xlsx"),sheet=1)
# Test it
summary(climair.vector$climchg+climair.vector$airpol)  # should have max and min at 1: all should be one of them but no more than one. 
cn.alldata <- merge(cn.alldata, num.chin.data,  by = "ID")
cn.alldata <- merge(cn.alldata, climair.vector, by.x="ID", by.y="id")

# then save it all
save(list = ls(all = TRUE), file = "chinopen-for-frequencies.RData")
# and save the relevant data frame alone
save("cn.alldata", file="cn_alldata.RData")

# next file: "freqtable.R"
# OR: "china-stm.R" 
